{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Special thanks to https://zablo.net/blog/post/training-roberta-from-scratch-the-missing-guide-polish-language-model/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "out = 'bert-base-bahasa-cased1'\n",
    "os.makedirs(out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AlbertTokenizer, BertModel, BertConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "directory = 'bert-base-v3'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('bert-base-bahasa-cased/spiece.model',\n",
       " 'bert-base-bahasa-cased/special_tokens_map.json',\n",
       " 'bert-base-bahasa-cased/added_tokens.json')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = AlbertTokenizer('sp10m.cased.bert.model', unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)\n",
    "tokenizer.save_pretrained('bert-base-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AlbertTokenizer.from_pretrained('./bert-base-bahasa-cased', unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !transformers-cli convert --model_type bert \\\n",
    "#   --tf_checkpoint bert-base-v3/model.ckpt \\\n",
    "#   --config bert-base-v3/config.json \\\n",
    "#   --pytorch_dump_output bert-base-bahasa-cased/pytorch_model.bin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "config = BertConfig(f'{directory}/config.json')\n",
    "config.vocab_size = 32000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoModelWithLMHead.from_pretrained('./bert-base-bahasa-cased/pytorch_model.bin', config = config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan rendang[SEP]',\n",
       "  'score': 0.10812027007341385,\n",
       "  'token': 2446},\n",
       " {'sequence': '[CLS] makan ayam dengan kicap[SEP]',\n",
       "  'score': 0.07653367519378662,\n",
       "  'token': 12928},\n",
       " {'sequence': '[CLS] makan ayam dengan nasi[SEP]',\n",
       "  'score': 0.06839974224567413,\n",
       "  'token': 450},\n",
       " {'sequence': '[CLS] makan ayam dengan ayam[SEP]',\n",
       "  'score': 0.059544261544942856,\n",
       "  'token': 638},\n",
       " {'sequence': '[CLS] makan ayam dengan sayur[SEP]',\n",
       "  'score': 0.05294966697692871,\n",
       "  'token': 1639}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fill_mask('makan ayam dengan [MASK]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained('bert-base-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !transformers-cli upload ./bert-base-bahasa-cased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d5b030884ab948d8b7eb98e1f2c09ef1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='Downloading', max=1023, style=ProgressStyle(description_width…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8d01969e77004b108aa17c9dd825656a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='Downloading', max=445014763, style=ProgressStyle(description_…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "model = BertModel.from_pretrained('huseinzol05/bert-base-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AlbertTokenizer.from_pretrained('huseinzol05/bert-base-bahasa-cased', \n",
    "                                            unk_token='[UNK]',pad_token='[PAD]', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids = torch.tensor([tokenizer.encode(\"husein tk suka mkan ayam\", add_special_tokens=True)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[[-1.1787,  0.3581, -2.2697,  ..., -0.4730, -0.5218, -1.5747],\n",
       "         [ 0.3075, -1.7360, -1.1151,  ...,  0.7858, -0.3618, -1.2349],\n",
       "         [-0.2988, -1.0545, -2.1584,  ..., -0.5917, -1.2333, -0.7880],\n",
       "         ...,\n",
       "         [-0.1735,  0.0953, -0.6671,  ...,  0.9276, -1.0623, -0.4068],\n",
       "         [ 0.9310,  1.0339, -0.2825,  ...,  1.4118,  0.7078, -1.3224],\n",
       "         [-0.8547, -0.3130, -2.2146,  ..., -0.8210, -0.0679, -1.2889]]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with torch.no_grad():\n",
    "    last_hidden_states = model(input_ids)[0]\n",
    "    \n",
    "last_hidden_states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': '[CLS] makan ayam dengan rendang[SEP]',\n",
       "  'score': 0.10812027007341385,\n",
       "  'token': 2446},\n",
       " {'sequence': '[CLS] makan ayam dengan kicap[SEP]',\n",
       "  'score': 0.07653367519378662,\n",
       "  'token': 12928},\n",
       " {'sequence': '[CLS] makan ayam dengan nasi[SEP]',\n",
       "  'score': 0.06839974224567413,\n",
       "  'token': 450},\n",
       " {'sequence': '[CLS] makan ayam dengan ayam[SEP]',\n",
       "  'score': 0.059544261544942856,\n",
       "  'token': 638},\n",
       " {'sequence': '[CLS] makan ayam dengan sayur[SEP]',\n",
       "  'score': 0.05294966697692871,\n",
       "  'token': 1639}]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = AutoModelWithLMHead.from_pretrained('huseinzol05/bert-base-bahasa-cased')\n",
    "fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)\n",
    "fill_mask('makan ayam dengan [MASK]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
